library(pacman) # pacman installs missing libraries automagically
pacman::p_load(tidyverse, Amelia, copula, MASS, parallel)

studentize = function(k) (k - mean(k)) / sd(k)
random_numbers = function(N, copula = 'Normal'){
	if (copula == 'Normal') {
        S = matrix(1/3, ncol=3, nrow=3) # covariances 1/3
        diag(S) = 1 # variances 1
		mu = rep(0, times=ncol(S)) # mean 0 
		dat = mvrnorm(N, mu, S) # random numbers
	} else {
        if (copula == 'Gumbel') {
		    myCop = gumbelCopula(1.8, dim = 3) # arbitrary tuning parameter
	    } else if (copula == 'Clayton') {
		    myCop = claytonCopula(1.5, dim = 3) 
	    } else if (copula == 'Frank') {
		    myCop = frankCopula(5.4, dim = 3) 
        } else {
            stop('copula must be Gumbel, Clayton, or Frank')
        }
        margins = c('norm', 'norm', 'norm') # normal marginal distributions 
    	paramMargins = list(list(), list(), list())  # empty list -> rnorm(mean=0, sd=1)
		myMvdc = mvdc(copula = myCop, margins = margins, paramMargins = paramMargins) # copula set up 
		dat = rMvdc(N, myMvdc) # random numbers
		dat = apply(dat, 2, studentize)
    }
    dat = data.frame(dat) 
	colnames(dat) = c('y', 'x1', 'x2') 
	return(dat)
}

simdata = function(parameters){
	dat = random_numbers(parameters$N, 
                         parameters$copula)
    pr = plogis(parameters$theta_1 + # probability of incomplete case
                parameters$theta_y * dat$y + # effect of y on missingness
                parameters$theta_x * dat$x1 + 
                parameters$theta_x * dat$x2)
    missing = as.logical(rbinom(parameters$N, 1, pr)) # missing index
    dat_miss_x1 = dat_miss_x2 = dat_miss_y = dat
    dat_miss_y$y[missing] = NA # missingness in the DV
    dat_miss_x1$x1[missing] = NA # missingness in the variable of interest
    dat_miss_x2$x2[missing] = NA # missingness in the control variable
    out = list('dat_full'=dat, 
               'dat_lwd'=dat_miss_y,
               'dat_miss_y_amelia' = amelia(dat_miss_y, m = 10, p2s=0)$imputations,
               'dat_miss_x1_amelia' = amelia(dat_miss_x1, m = 10, p2s=0)$imputations,
               'dat_miss_x2_amelia' = amelia(dat_miss_x2, m = 10, p2s=0)$imputations)
    return(out)
}

fit = function(data) UseMethod("fit", data)
fit.data.frame = function(data){ # Fit to data.frame (dat_full or dat_lwd)
    model = lm(y ~ x1 + x2, data)
    out = coef(model)['x1']
    return(out)
}
fit.list = function(data){ # Fit to imputed data, which come as a list of DFs
    models = lapply(data, function(k) lm(y ~ x1 + x2, data=k)) # Fit in the imputed datasets
    beta = sapply(models, function(k) coef(k)['x1']) # Extract coefficient of interest
    out = mean(beta) # Rubin's rule: Take the mean of the estimated x1 coefficients
    return(out)
}

.montecarlo = function(parameters){ # conduct a single monte carlo replication
    dat = simdata(parameters) # artificial data
    out = sapply(dat, fit) # fit models
    names(out) = gsub('\\.x\\d', '', names(out)) # cleanup model names
    missing_share = mean(is.na(dat$dat_lwd$y)) # share of incomplete observations
    out = c(out, 'missing_share' = missing_share)
    return(out)
}
montecarlo = function(parameters){ # conduct a bunch of monte carlo replications
    RNGkind("L'Ecuyer-CMRG") # in principle, this should make parallel computation replicable
    set.seed(parameters$seed)
    out = mclapply(1:parameters$M, # repeat the monte carlo experiment M times
                   function(k) .montecarlo(parameters), # this can take a long time
                   mc.cores=parameters$cores) # I used a Linode.com with 20 CPUs and lots of RAM
    out = data.frame(do.call('rbind', out)) # combine results into a large DF
    row.names(out) = NULL # cleanup
    for(p in names(parameters)){ # save tuning parameters in the results DF
        out[, p] = parameters[[p]]
    }
    return(out)
}

# Baseline tuning parameters
baseline = list('M' = 5000, # nb of Monte Carlo replications
                'cores' = 20, # nb of CPUs to use for parallel computation
                'N' = 1000, # nb of obs in each dataset
                'theta_1' = 0, # share of missing observations (0 ~> 50%)
                'theta_y' = 0, # effect of y on missingness
                'theta_x' = 1, # effect of x1 and x2 on missingness
		        'copula' = 'Normal') # multivariate distribution (Normal, Gumbel, Clayton, Frank)

# Variations of the tuning parameters
tuning = list()
for(theta_1 in -2:2){ # share of missingness
    for(theta_y in 0:1){ # selection on the dependent variables
        for(theta_x in 0:1){ # selection on the independent variables
            for(copula in c('Normal', 'Gumbel', 'Clayton', 'Frank')){
                key = c('theta_1', 'theta_y', 'theta_x', 'copula')
                val = list(theta_1, theta_y, theta_x, copula)
                tmp = replace(baseline, key, val)
                tuning = c(tuning, list(tmp))
            }
        }
    }
}

# Seeds -- hackish, but required for replicability via parallel L'Écuyer 
set.seed(1024)
seeds = sample(1:100000, length(tuning))
for(i in seq_along(seeds)){
    tuning[[i]]$seed = seeds[i]
}

# Monte Carlo experiment
results = NULL
for(i in seq_along(tuning)){ 
    cat(i, '/', length(tuning), '\n') # progress report
    results[[i]] = montecarlo(tuning[[i]]) # real work
}
results = dplyr::bind_rows(results)

# Plot results
tmp = results %>% 
      dplyr::select(matches('dat_|missing|theta|copula')) %>%
      dplyr::group_by(theta_1, theta_y, theta_x, copula) %>%
      dplyr::summarize(`LWD` = mean(abs(dat_full - dat_lwd) / dat_full) * 100,
                       `Amelia y` = mean(abs(dat_full - dat_miss_y_amelia) / dat_full) * 100,
                       `Amelia x1` = mean(abs(dat_full - dat_miss_x1_amelia) / dat_full) * 100,
                       `Amelia x2` = mean(abs(dat_full - dat_miss_x2_amelia) / dat_full) * 100,
                       `Missing share` = mean(missing_share)) %>%
      dplyr::mutate(Mechanism = ifelse((theta_y == 0) & (theta_x == 0), 'MCAR', NA),
                    Mechanism = ifelse((theta_y == 1) & (theta_x == 1), 'DV & IV', Mechanism),
                    Mechanism = ifelse((theta_y == 1) & (theta_x == 0), 'DV', Mechanism),
                    Mechanism = ifelse((theta_y == 0) & (theta_x == 1), 'IV', Mechanism)) %>%
      ungroup %>%
      dplyr::select(-matches('theta')) %>%
      tidyr::gather(Estimator, MAD, -Mechanism, -`Missing share`, -copula)

p = ggplot(tmp, aes(`Missing share`, MAD, shape = Estimator)) + 
    geom_point() +
    geom_line(linetype=3) + 
    facet_grid(copula ~ Mechanism) + 
    theme_bw() +
    theme(strip.background=element_blank(),
          legend.title=element_blank(),
          panel.grid=element_blank()) +
    ylab('Mean absolute deviation from the full-data estimate (%)')
    xlab('Share of incomplete observations')
ggsave(p, file = 'montecarlo.pdf', width = 9, height = 7)
